1 Setup

library(here)
library(tidyverse)
library(plotly)
library(scales)
library(lubridate)

2 Quantitative data

2.1 Read in data

in_file = here::here("data", "fecundity", "20210205_quantitative.xlsx")
df_quant = readxl::read_xlsx(in_file)

2.2 Tidy data

# File to save to 
out_file = here::here("data", "fecundity", "20210208_quant.csv")

# Remove NAs
df_quant = df_quant %>% 
  filter(!is.na(Strain))

# Split into list
quant_list = list("F15" = df_quant[, 1:8],
                  "F16" = df_quant[, c(1, 9:length(df_quant))])

# Pivot longer
out_df = lapply(quant_list, function(generation){
  out = generation %>% 
    # Pivot longer
    tidyr::pivot_longer(cols = contains("eggs"),
                        names_to = "DATE",
                        names_prefix = "eggs ",
                        values_to = "EGG_COUNT") %>% 
    # Convert into date
    dplyr::mutate(DATE = DATE %>% 
                  str_c("2020") %>% 
                  str_replace_all("\\.", "-") %>% 
                  as.Date("%d-%m-%Y")) %>% 
    # Get weekday
    dplyr::mutate(WEEKDAY = weekdays(DATE)) %>% 
    # Rename columns
    dplyr::select(STRAIN = "Strain",
                  FEMALE_COUNT = contains("females"),
                  DATE, WEEKDAY, EGG_COUNT,
                  everything()) %>% 
    # Replace question marks in FEMALE_COUNT and convert to integer
    dplyr::mutate(FEMALE_COUNT = str_replace(FEMALE_COUNT, "\\?", "") %>% 
                    as.integer()) %>% 
    # Get eggs per female
    dplyr::mutate(EGGS_PER_FEMALE = EGG_COUNT / FEMALE_COUNT)
  
  return(out)
}) %>% 
  # bind in to DF
  dplyr::bind_rows(.id = "GENERATION") %>% 
  # Write to CSV
  readr::write_csv(out_file, na = "")

# Adapt variables for plotting
strain_levels = unique(out_df$STRAIN)
pal_primary = hue_pal()(length(strain_levels))
names(pal_primary) = strain_levels

out_df$STRAIN = factor(out_df$STRAIN, levels = strain_levels)
out_df$WEEKDAY = factor(out_df$WEEKDAY,
                        levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))

knitr::kable(head(out_df))
GENERATION STRAIN FEMALE_COUNT DATE WEEKDAY EGG_COUNT EGGS_PER_FEMALE
F15 4-1 NA 2020-07-02 Thursday NA NA
F15 4-1 NA 2020-07-03 Friday NA NA
F15 4-1 NA 2020-07-21 Tuesday NA NA
F15 4-1 NA 2020-07-23 Thursday NA NA
F15 4-1 NA 2020-07-28 Tuesday NA NA
F15 4-1 NA 2020-07-30 Thursday NA NA

2.3 Plot

2.3.1 Correlation between F15 and F16

corr_plot = out_df %>% 
  dplyr::group_by(GENERATION, STRAIN) %>% 
  summarise(mean(EGGS_PER_FEMALE)) %>% 
  tidyr::pivot_wider(names_from = GENERATION,
                     values_from = `mean(EGGS_PER_FEMALE)`) %>% 
    ggplot() +
      geom_point(aes(F15, F16, colour = STRAIN)) +
      coord_fixed() +
      theme_bw() +
      guides(colour = F) +
      xlim(0,7) +
      ylim(0,7) +
      ggtitle("Mean eggs per female") +
      scale_color_manual(values = pal_primary)
## `summarise()` has grouped output by 'GENERATION'. You can override using the `.groups` argument.
# Plotly
ggplotly(corr_plot, height = 400, width = 400) %>% 
  layout(showlegend = F)

2.3.2 Effect of day of collection

collection_day_plot = out_df %>% 
  ggplot() +
    geom_point(aes(DATE, EGGS_PER_FEMALE, colour = STRAIN), alpha = 0.8) +
    facet_wrap(vars(GENERATION, WEEKDAY)) +
    guides(colour = F) +
    theme_bw() +
    xlab("Date") +
    ylab("Mean eggs produced per female") +
    scale_color_manual(values = pal_primary)

ggplotly(collection_day_plot, height = 1000, width = 800) %>% 
  layout(showlegend = F)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Get function to reverse labels in ggplotly (from https://stackoverflow.com/questions/59611914/reverse-the-legend-order-when-using-ggplotly)
reverse_legend_labels <- function(plotly_plot) {
  n_labels <- length(plotly_plot$x$data)
  plotly_plot$x$data[1:n_labels] <- plotly_plot$x$data[n_labels:1]
  plotly_plot
}

violin_plot = out_df %>%
  dplyr::mutate(STRAIN = factor(STRAIN, levels = rev(strain_levels))) %>% 
  ggplot() +
    geom_violin(aes(STRAIN, EGGS_PER_FEMALE, fill = STRAIN, colour = STRAIN)) +
    geom_jitter(aes(STRAIN, EGGS_PER_FEMALE, label = GENERATION), size = 0.25) +
    theme_bw() +
    coord_flip() +
    guides(fill = F, colour = F) +
    xlab("MIKK panel line") +
    ylab("Mean eggs produced per female") +
    scale_color_manual(values = pal_primary) +
    scale_fill_manual(values = pal_primary)
## Warning: Ignoring unknown aesthetics: label
violin_plot %>%
  plotly::ggplotly(height = 1200, width = 700) %>%
  reverse_legend_labels() %>% 
  layout(showlegend = F)
## Warning: Removed 518 rows containing non-finite values (stat_ydensity).

3 Semi-quantitative data

3.1 Read in data and tidy

in_file = here::here("data", "fecundity", "20210205_semiquantitative.xlsx")
df_semi = readxl::read_xlsx(in_file, range = "A1:C81")
out_file = here::here("data", "fecundity", "20210208_semiquant.csv")

# One sample is missing from this dataset. Which one?
df_quant$Strain[which(!df_quant$Strain %in% df_semi$Pair)]

[1] “91-1”

# Create recode vector
date_recode = c("Feb 2019", "Jul 2020")
names(date_recode) = c("2/19", "7/20")
recode_vec_1 = c(0, 1, 2, 3, 4, 5)
names(recode_vec_1) = c(0, "o", "x", "x/", "xx", "xxx")
recode_vec_2 = c("Not producing",
                 "Do not produce every day; <3 eggs when they do",
                 "Do not produce every day; <5 eggs when they do",
                 "0-3 eggs per day",
                 "0-5 eggs per day",
                 "5-10 eggs per day")
names(recode_vec_2) = c(0, 1, 2, 3, 4, 5)
recode_vec_3 = gsub("; ", ";\n", recode_vec_2)

# Tidy

semi_out = df_semi %>% 
  # pivot fecundity
  tidyr::pivot_longer(cols = contains("fecundity"), 
                      names_to = "DATE",
                      names_prefix = "fecundity ",
                      values_to = "FECUNDITY") %>% 
  # recode fecundity measures
  dplyr::mutate(DATE = dplyr::recode(DATE, !!!date_recode),
                FECUNDITY = dplyr::recode(FECUNDITY, "xx/" = "xx"),
                FECUNDITY = dplyr::na_if(FECUNDITY, "do not prod. Yet"),
                FECUNDITY = ifelse(is.na(FECUNDITY), 0, FECUNDITY),
                FECUNDITY = dplyr::recode(FECUNDITY, !!!factor(recode_vec_1)),
                KEY = dplyr::recode(FECUNDITY, !!!recode_vec_2)) %>%
  # rename STRAIN
  dplyr::rename(STRAIN = Pair) %>% 
  # factorise
  dplyr::mutate(STRAIN = factor(STRAIN, levels = strain_levels)) %>% 
  # write to file
  readr::write_csv(out_file, na = "")

knitr::kable(head(semi_out))
STRAIN DATE FECUNDITY KEY
4-1 Jul 2020 2 Do not produce every day; <5 eggs when they do
4-1 Feb 2019 2 Do not produce every day; <5 eggs when they do
4-2 Jul 2020 2 Do not produce every day; <5 eggs when they do
4-2 Feb 2019 4 0-5 eggs per day
5-1 Jul 2020 2 Do not produce every day; <5 eggs when they do
5-1 Feb 2019 4 0-5 eggs per day

3.2 Plot

semi_all = semi_out %>% 
  dplyr::mutate(STRAIN = factor(STRAIN, levels = rev(strain_levels)),
                KEY = gsub("; ", ";\n", KEY)) %>%
  ggplot() +
    geom_col(aes(STRAIN, KEY, fill = STRAIN)) +
    theme_bw() +
    scale_fill_manual(values = pal_primary) +
    facet_wrap(vars(DATE), ncol = 2) +
    guides(fill = F) +
    coord_flip() +
    theme(axis.text.x = element_text(size = 4)) +
    xlab("MIKK line") +
    ylab("Fecundity")

ggplotly(semi_all, width = 1200, height = 1000) %>% 
  reverse_legend_labels() %>% 
  layout(showlegend = F)

3.2.1 Correlation

semi_corr = semi_out %>% 
  tidyr::pivot_wider(id_cols = STRAIN,
                     names_from = DATE,
                     values_from = FECUNDITY) %>% 
  ggplot() +
    geom_jitter(aes(`Feb 2019`, `Jul 2020`, colour = STRAIN), alpha = 0.7) +
    coord_fixed() +
    theme_bw() +
    guides(colour = F) +
    ggtitle("Correlation in semi-quantitative measure") +
    scale_color_manual(values = pal_primary)

  
# Plotly
ggplotly(semi_corr, height = 400, width = 400) %>% 
  layout(showlegend = F)

3.2.2 Final quantitative

semi_final = semi_out %>% 
  dplyr::mutate(STRAIN = factor(STRAIN, levels = rev(strain_levels)),
                KEY = gsub("; ", ";\n", KEY)) %>% 
  dplyr::filter(DATE == "Jul 2020") %>% 
  ggplot() +
    geom_col(aes(STRAIN, KEY, fill = STRAIN)) +
    theme_bw() +
    scale_fill_manual(values = pal_primary) +
    guides(fill = F) +
    coord_flip() + 
    theme(axis.text.x = element_text(size = 4)) +
    xlab("MIKK panel line") +
    ylab("Fecundity") +
    ggtitle("Fecundity of the MIKK panel lines as of July 2020")

ggplotly(semi_final, width = 600, height = 1000, tooltip = c("STRAIN", "KEY")) %>% 
  reverse_legend_labels() %>% 
  layout(showlegend = F)

Horizontal

final_hor = semi_out %>% 
  dplyr::mutate(KEY = gsub("; ", ";\n", KEY),
                KEY = factor(KEY, levels = recode_vec_3)) %>% 
  dplyr::filter(DATE == "Jul 2020") %>% 
  ggplot() +
    geom_col(aes(STRAIN, KEY, fill = STRAIN)) +
    theme_bw() +
    scale_fill_manual(values = pal_primary) +
    guides(fill = F) +
    theme(axis.text.x = element_text(size = 3.5,
                                     angle = 45),
          axis.text.y = element_text(size = 5)) +
    xlab("MIKK panel line") +
    ylab(NULL) +
    ggtitle("Fecundity of the MIKK panel lines as of July 2020")

ggplotly(final_hor, width = 900, height = 300) %>% 
  layout(showlegend = F)